Load required packages

library(data.table) #reading in the data
library(dplyr) #dataframe manipulation
library(ggplot2) #viz
library(ranger) #the random forest implementation
library(plotly) #3D plotting
library(tidyr) #dataframe manipulation
library(FNN) #k nearest neighbors algorithm
library(xgboost)

Load data, count total #, view columns

fb <- fread("~/PycharmProjects/kaggle-project/facebook/train.csv", integer64 = "character", showProgress = FALSE)
nrow(fb)
## [1] 29118021
head(fb, 3)
##    row_id      x      y accuracy   time   place_id
## 1:      0 0.7941 9.0809       54 470702 8523065625
## 2:      1 5.9567 4.7968       13 186555 1757726713
## 3:      2 8.3078 7.0407       74 322648 1137537235
summary(fb)
##      row_id               x                y             accuracy      
##  Min.   :       0   Min.   : 0.000   Min.   : 0.000   Min.   :   1.00  
##  1st Qu.: 7279505   1st Qu.: 2.535   1st Qu.: 2.497   1st Qu.:  27.00  
##  Median :14559010   Median : 5.009   Median : 4.988   Median :  62.00  
##  Mean   :14559010   Mean   : 5.000   Mean   : 5.002   Mean   :  82.85  
##  3rd Qu.:21838515   3rd Qu.: 7.461   3rd Qu.: 7.510   3rd Qu.:  75.00  
##  Max.   :29118020   Max.   :10.000   Max.   :10.000   Max.   :1033.00  
##       time          place_id        
##  Min.   :     1   Length:29118021   
##  1st Qu.:203057   Class :character  
##  Median :433922   Mode  :character  
##  Mean   :417010                     
##  3rd Qu.:620491                     
##  Max.   :786239

Abstract sample area

fb %>% filter(x >1, x <1.25, y >2.5, y < 2.75) -> fb_s
nrow(fb_s)  
## [1] 17710
fb %>% filter(x >3, x <3.25, y >2.5, y < 2.75) -> fb_s2
nrow(fb_s2)  
## [1] 15929

Observe data

place_id

Since target is to classify place_id, 1st to observe place_id From figures: horizon vs continuously increase

par(mfrow=c(3,1))
plot(sort(fb_s$place_id))
plot(sort(fb_s2$place_id))
plot(sort(fb_s2$place_id)[0:2000])

From figures: seem no relation between place_id and accuracy

par(mfrow=c(2,1))
r_pla_aur=sort(fb_s2$place_id,index.return=TRUE)
plot(r_pla_aur$x[0:2000])
d=r_pla_aur$ix[0:2000]
plot(fb_s2[d,"accuracy"])

par(mfrow=c(2,1))
r_pla_aur=sort(fb_s2$place_id,index.return=TRUE)
plot(r_pla_aur$x[500:750])
d=r_pla_aur$ix[500:750]
plot(fb_s2[d,"accuracy"])

time

fb_s$hour = (fb_s$time/60) %% 24
fb_s$weekday = (fb_s$time/(60*24)) %% 7
fb_s$month = (fb_s$time/(60*24*30)) %% 12 #month-ish
fb_s$year = fb_s$time/(60*24*365)
fb_s$day = fb_s$time/(60*24) %% 365
head(fb_s)
##   row_id      x      y accuracy   time   place_id      hour   weekday
## 1    600 1.2214 2.7023       17  65380 6683426742  9.666667 3.4027778
## 2    957 1.1832 2.6891       58 785470 6683426742 11.166667 6.4652778
## 3   4345 1.1935 2.6550       11 400082 6889790653 20.033333 4.8347222
## 4   4735 1.1452 2.6074       49 514983 6822359752 15.050000 0.6270833
## 5   5580 1.0089 2.7287       19 732410 1527921905 14.833333 4.6180556
## 6   6090 1.1140 2.6262       11 145507 4000153867  1.116667 3.0465278
##       month      year       day
## 1  1.513426 0.1243912  189.5072
## 2  6.182176 1.4944254 2276.7246
## 3  9.261157 0.7611910 1159.6580
## 4 11.920903 0.9798002 1492.7043
## 5  4.953935 1.3934741 2122.9275
## 6  3.368218 0.2768398  421.7594

Train model

Split data

small_train = fb_s[fb_s$time < 7.3e5,]
small_val = fb_s[fb_s$time >= 7.3e5,] 

visualize 2D: small_train

ggplot(small_train, aes(x, y )) +
    geom_point(aes(color = place_id)) + 
    theme_minimal() +
    theme(legend.position = "none") +
    ggtitle("Check-ins colored by place_id")

Count by place_id

sort((small_train %>% count(place_id))$n, decreasing = T)[0:140]
##   [1] 968 848 784 757 651 603 560 512 495 463 378 312 247 243 214 204 199
##  [18] 198 179 175 171 167 165 156 155 153 149 134 127 125 124 115 114 113
##  [35] 111 106 104 104  96  95  92  89  89  87  87  84  83  82  82  79  79
##  [52]  76  72  72  71  70  67  64  61  60  54  54  54  53  53  49  48  47
##  [69]  46  46  45  43  42  41  40  38  36  34  34  34  32  32  32  31  30
##  [86]  30  30  30  28  27  26  26  26  25  25  24  24  24  23  23  23  22
## [103]  21  21  21  20  20  20  19  19  18  17  15  15  15  15  15  13  13
## [120]  13  13  13  12  12  12  11  11  11  10  10  10  10  10  10   9   9
## [137]   9   9   9   9

visualize 3D: small_train with place_id count>500 z=hour

small_train %>% count(place_id) %>% filter(n > 500) -> ids
  #if n>200, warning: n too large, allowed maximum for palette Set2 is 8
small_trainz = small_train[small_train$place_id %in% ids$place_id,]

plot_ly(data = small_trainz, x = x , y = y, z = hour, color = place_id,  type = "scatter3d", mode = "markers", marker=list(size= 5)) %>% layout(title = "Place_id's by position and Time of Day")

z=week

plot_ly(data = small_trainz, x = x , y = y, z = weekday, color = place_id,  type = "scatter3d", mode = "markers", marker=list(size= 5)) %>% layout(title = "Place_id's by position and Day of Week")

Random Forest

Count unique place_id

length(unique(small_train$place_id))
## [1] 770

Ignore fewer place_id

small_train %>% count(place_id) %>% filter(n > 3) -> ids
small_train = small_train[small_train$place_id %in% ids$place_id,]

optimal weights for scaling your variables since knn is sensitive to the magnitutde of variables@¥: s,l,w

summary(small_train)
##      row_id               x               y            accuracy      
##  Min.   :     600   Min.   :1.000   Min.   :2.500   Min.   :   1.00  
##  1st Qu.: 7327896   1st Qu.:1.049   1st Qu.:2.575   1st Qu.:  24.00  
##  Median :14412209   Median :1.123   Median :2.644   Median :  62.00  
##  Mean   :14500685   Mean   :1.123   Mean   :2.633   Mean   :  80.27  
##  3rd Qu.:21625800   3rd Qu.:1.191   3rd Qu.:2.688   3rd Qu.:  75.00  
##  Max.   :29112154   Max.   :1.250   Max.   :2.750   Max.   :1000.00  
##       time          place_id              hour           weekday        
##  Min.   :   203   Length:15595       Min.   : 0.000   Min.   :0.000694  
##  1st Qu.:163173   Class :character   1st Qu.: 6.683   1st Qu.:1.767014  
##  Median :371294   Mode  :character   Median :11.933   Median :3.287500  
##  Mean   :366123                      Mean   :12.017   Mean   :3.422029  
##  3rd Qu.:565108                      3rd Qu.:17.533   3rd Qu.:5.152431  
##  Max.   :729969                      Max.   :23.983   Max.   :6.999306  
##      month                year                day           
##  Min.   : 0.000532   Min.   :0.0003862   Min.   :   0.5884  
##  1st Qu.: 1.846852   1st Qu.:0.3104509   1st Qu.: 472.9652  
##  Median : 3.725116   Median :0.7064193   Median :1076.2145  
##  Mean   : 4.589201   Mean   :0.6965804   Mean   :1061.2251  
##  3rd Qu.: 7.300185   3rd Qu.:1.0751684   3rd Qu.:1637.9957  
##  Max.   :11.999907   Max.   :1.3888299   Max.   :2115.8522
s = 2
l = 125
w = 500

create_matrix = function(train) {
    cbind(s*train$y,
          train$x,
          train$hour/l,
          train$weekday/w,
          train$year/w,
          train$month/w,
          train$time/(w*60*24*7))
    }

X = create_matrix(small_train)
X_val = create_matrix(small_val)

KNN

model_knn = FNN::knn(train = X, test = X_val, cl = small_train$place_id, k = 15)

preds <- as.character(model_knn)
truth <- as.character(small_val$place_id)
mean(truth == preds)
## [1] 0.5151964
head(X)
##        [,1]   [,2]        [,3]        [,4]         [,5]        [,6]
## [1,] 5.4046 1.2214 0.077333333 0.006805556 0.0002487823 0.003026852
## [2,] 5.3100 1.1935 0.160266667 0.009669444 0.0015223820 0.018522315
## [3,] 5.2148 1.1452 0.120400000 0.001254167 0.0019596005 0.023841806
## [4,] 5.2524 1.1140 0.008933333 0.006093056 0.0005536796 0.006736435
## [5,] 5.0006 1.1449 0.135600000 0.005412500 0.0012038699 0.014647083
## [6,] 5.0374 1.2015 0.128266667 0.007336111 0.0026283181 0.007977870
##            [,7]
## [1,] 0.01297222
## [2,] 0.07938135
## [3,] 0.10217917
## [4,] 0.02887044
## [5,] 0.06277321
## [6,] 0.13704802

Random Forest

set.seed(131L)
small_train$place_id <- as.factor(small_train$place_id) # ranger needs factors for classification
model_rf <- ranger(place_id ~ x + y + accuracy + hour + weekday + month + year,
                   small_train,
                   num.trees = 100,
                   write.forest = TRUE,
                   importance = "impurity")
## Growing trees.. Progress: 64%. Estimated remaining time: 17 seconds.
pred = predict(model_rf, small_val)
pred = pred$predictions
accuracy = mean(pred == small_val$place_id) 

accuracy
## [1] 0.5507784

Visualize RF accuracy It does seem that the correctly identified check-ins are more “clustered” while the wrongly identified ones are more uniformly distributed but other than that no clear patters here.

small_val$Correct = (pred == small_val$place_id)

ggplot(small_val, aes(x, y )) +
    geom_point(aes(color = Correct)) + 
    theme_minimal() +
    scale_color_brewer(palette = "Set1")

look at what kind of id’s our random forest gets wrong We see below that our model is doing actually really great on the more popular id’s(more blue on the right). However it loses when it looks at id’s that appear only a few times.

#reordering the levels based on counts:
small_val$place_id <- factor(small_val$place_id,
                             levels = names(sort(table(small_val$place_id), decreasing = TRUE)))

small_val %>% 
    ggplot(aes(x = place_id)) + geom_bar(aes(fill = Correct)) + 
    theme_minimal() +
    theme(axis.text.x = element_blank()) +
    ggtitle("Prediction Accuracy by ID and Popularity") +
    scale_fill_brewer(palette = "Set1")

importance of our variables 1. y variable is more important than the x This means that the y axis is a better predictior of place_id and the random forest figures this out on its own. 2. hour and other time features are also good predictiors but less so than the spatial features - this makes sense since the location of a check-in should be more important than the time of the check-in. 3. Accuracy is a bit misterious since we don’t get an actual definition for it, but at least the model tells us it’s somewhat important.

data.frame(as.list(model_rf$variable.importance)) %>% gather() %>% 
    ggplot(aes(x = reorder(key, value), y = value)) +
    geom_bar(stat = "identity", width = 0.6, fill = "grey") +
    coord_flip() +
    theme_minimal() +
    ggtitle("Variable Importance (Gini Index)") +
    theme(axis.title.y = element_blank())